%--------------------------------------------------------------------------
%                  APYN ANALYSIS PROCEDURE
%-------------------------------------------------------------------------- 
%
% Title: Semi-parametric Selection Models for Potentially Non-ignorable Attrition...
% in Panel Studies with Refreshment Samples
% Manuscript ID PA-2013-094
% Y.SI, J.REITER AND S.HILLYGUS
%-------------------------------------------------------------------------- 
%               Bayesian semi-parametric selection models
%                   joint for (X Y) & W depends on (X Y)
%                        APYN data analysis
%-------------------------------------------------------------------------- 
%%%% Author: Yajuan Si
%%%% Latest edit date: 03/30/2014
%-------------------------------------------------------------------------- 
clear;clc;close all;
tic

%%%%------------------load imputation output---------------------------%%%%
%load the working space and output by imputation
%output from code-for-APYN-imputation.m

load output/apyndata-output.mat 
%%%%-------------convergence check of MCMC samples---------------------%%%%

%hyperparameters
plot(alpha_out)
plot(kout)
plot(w_out)

effituse=50001:50:Trun; %effective sample size

figure(1)
subplot(1,3,1)
plot(alpha_out(effituse));
subplot(1,3,2)
plot(kout(effituse));
subplot(1,3,4)
plot(w_out(effituse))


mean(alpha_out(effituse))
mode(kout(effituse))
mean(w_out(effituse))

%weighted average of cell probabilities
figure(3)
for j=1:6
subplot(3,2,j);
plot(cell_out(effituse,q+p1-1+j)); %36
xlabel('Pr(Y2=1)')
end

for j=1:6
subplot(3,2,j);
plot(y1_out(effituse,j)); 
end
for j=1:6
subplot(3,2,j);
plot(y2_out(effituse,j)); 
end
plot(mean(cell_out),'o')
hold on
plot(mean(data==1),'r*')

mean([y1_out y2_out w_out])

%class allocation weights---do not tell about convergence
for j=1:16
figure(3)
subplot(4,4,j);
plot(pi_out(:,j));
end


%cofficients in the attrition model
for j=1:16
figure(2)    
subplot(4,4,j);
plot(beta_w_out(:,xp+2-j));
end

for j=1:16
figure(2)    
subplot(4,4,j);
plot(beta_w_out(effituse,xp+2-j));
end

[median(beta_w_out(effituse,:))' quantile(beta_w_out(effituse,:),[0.025,0.975],1)']

figure(3)
X_beta=1:(xp+1);
Ybeta=median(beta_w_out(effituse,:));
Lbeta=quantile(beta_w_out(effituse,:),0.025,1);
Ubeta=quantile(beta_w_out(effituse,:),0.975,1);

%posterior summary for beta used for Figure
csvwrite('beta.est.csv',[Ybeta' Lbeta' Ubeta'])



 
%%%%----------------------MI inference------------------------------%%%%

cj=1; %level
varis=q+p1+1:p; %Y2
dis=length(varis);
m1=50;

%marginal prob of Y2 based on combination of panel and refreshment sample
y_q=zeros(m1,dis);y_u=zeros(m1,dis);

for l=1:m1
    micut=300 + 4*l; %save 50 sets for MI
    data = impdata_out(impdata_out(:,1)==micut,2:p+1);
    
    %transfer LV3 to be binay: aggrecate level 1 and 2
    data(data(:,q+p1+3)==2,q+p1+3)=1; 
    %for LV31: a great deal is treated as level 1
    for j=1:dis
        if j==dis
            cj=4;
        else cj=1;
        end 
       y_q(l,j)=mean(data(:,varis(j))==cj);
       y_u(l,j)=y_q(l,j)*(1-y_q(l,j))/N;
       
    end
end  
qpart_y= mean(y_q);
upart_y= mean(y_u);
bpart_y= var(y_q);
%combining rules
Tpart_y=(1+1/m1)*bpart_y+upart_y;
df_y=(m1-1)*((1+upart_y./bpart_y/(1+1/m1)).^2);
low_y = qpart_y -tinv(0.975, df_y) .* sqrt(Tpart_y);
up_y = qpart_y +tinv(0.975, df_y) .* sqrt(Tpart_y);
csvwrite('marprob_y_est.csv',[qpart_y' low_y' up_y'])



part_prob1=zeros(m1,dis); part_prob0=zeros(m1,dis);

part_prob4=zeros(m1,dis); part_prob5=zeros(m1,dis);
part_prob6=zeros(m1,dis); part_prob7=zeros(m1,dis);
part_prob11=zeros(m1,dis); part_prob00=zeros(m1,dis);
part_prob44=zeros(m1,dis); part_prob55=zeros(m1,dis);
part_prob66=zeros(m1,dis); part_prob77=zeros(m1,dis);
part_prob111=zeros(m1,dis); part_prob555=zeros(m1,dis);
upart_prob1=zeros(m1,dis); upart_prob0=zeros(m1,dis);

upart_prob4=zeros(m1,dis); upart_prob5=zeros(m1,dis);
upart_prob6=zeros(m1,dis); upart_prob7=zeros(m1,dis);
upart_prob11=zeros(m1,dis); upart_prob00=zeros(m1,dis);
upart_prob44=zeros(m1,dis); upart_prob55=zeros(m1,dis);
upart_prob66=zeros(m1,dis); upart_prob77=zeros(m1,dis);
upart_prob111=zeros(m1,dis); upart_prob555=zeros(m1,dis);


for l=1:m1
    micut=300 + 4*l;
    data = impdata_out(impdata_out(:,1)==micut,2:p+1);
    W = impdata_out(impdata_out(:,1)==micut,p+2);
    
    data(data(:,q+p1+3)==2,q+p1+3)=1;

    for j=1:dis
      
        if j==dis
            cj=4;
        else cj=1;
        end 
        
    
    tt_panel=crosstab(W(1:Np),data(1:Np,varis(j)));
    part_prob0(l,j)=tt_panel(1,cj)/Np;%joint w=0 in the panel
    part_prob1(l,j)=tt_panel(2,cj)/Np;
    part_prob00(l,j)=tt_panel(1,cj)/Nip; %cond w=0 in the panel
    part_prob11(l,j)=tt_panel(2,cj)/Ncp;
    part_prob111(l,j)=sum(tt_panel(:,cj))/Np;%mar in panel
   
    upart_prob0(l,j)=part_prob0(l,j)*(1-part_prob0(l,j))/Np;
    upart_prob1(l,j)=part_prob1(l,j)*(1-part_prob1(l,j))/Np;
    upart_prob00(l,j)=part_prob00(l,j)*(1-part_prob00(l,j))/Nip;
    upart_prob11(l,j)=part_prob11(l,j)*(1-part_prob11(l,j))/Ncp;
    upart_prob111(l,j)=part_prob111(l,j)*(1-part_prob111(l,j))/Np;
 
    tt_ref=crosstab(W(Np+1:N),data(Np+1:N,varis(j)));
    part_prob4(l,j)=tt_ref(1,cj)/Nr; %joint W=0 ref
    part_prob5(l,j)=tt_ref(2,cj)/Nr;
    part_prob44(l,j)=tt_ref(1,cj)/sum(W(Np+1:N)==0); %cond W=0 ref
    part_prob55(l,j)=tt_ref(2,cj)/sum(W(Np+1:N)==1);
    part_prob555(l,j)=sum(tt_ref(:,cj))/Nr;%mar in ref
     
    upart_prob4(l,j)=part_prob4(l,j)*(1-part_prob4(l,j))/Nr;
    upart_prob5(l,j)=part_prob5(l,j)*(1-part_prob5(l,j))/Nr;
    upart_prob44(l,j)=part_prob44(l,j)*(1-part_prob44(l,j))/sum(W(Np+1:N)==0);
    upart_prob55(l,j)=part_prob55(l,j)*(1-part_prob55(l,j))/sum(W(Np+1:N)==1);
    upart_prob555(l,j)=part_prob555(l,j)*(1-part_prob555(l,j))/Nr; 
   
    tt_all=crosstab(W,data(:,varis(j)));
    part_prob6(l,j)=tt_all(1,cj)/N; %joint w=0 in all
    part_prob7(l,j)=tt_all(2,cj)/N;
    part_prob66(l,j)=tt_all(1,cj)/sum(W==0);%cond w=0 all
    part_prob77(l,j)=tt_all(2,cj)/sum(W==1);

    upart_prob6(l,j)=part_prob6(l,j)*(1-part_prob6(l,j))/N;
    upart_prob7(l,j)=part_prob7(l,j)*(1-part_prob7(l,j))/N;
    upart_prob66(l,j)=part_prob66(l,j)*(1-part_prob66(l,j))/sum(W==0);
    upart_prob77(l,j)=part_prob77(l,j)*(1-part_prob77(l,j))/sum(W==1);
    end

    
end   

%joint
[mean(part_prob1)' mean(part_prob5)'  mean(part_prob7)' mean(part_prob0)' mean(part_prob4)'...
 mean(part_prob6)']

%marginal and conditional

[mean(part_prob111)' mean(part_prob555)' mean(part_prob11)' mean(part_prob55)'...
     mean(part_prob77)' mean(part_prob00)' mean(part_prob44)' mean(part_prob66)']
 
%combing rules
qpart_mar= [mean(part_prob111) mean(part_prob555)];
bpart_mar= [var(part_prob111) var(part_prob555)];
upart_mar= [mean(upart_prob111) mean(upart_prob555)];
Tpart_mar=(1+1/m1)*bpart_mar+upart_mar;
df_mar=(m1-1)*((1+upart_mar./bpart_mar/(1+1/m1)).^2);
low_mar = qpart_mar -tinv(0.975, df_mar) .* sqrt(Tpart_mar);
up_mar = qpart_mar +tinv(0.975, df_mar) .* sqrt(Tpart_mar);

qpart_cd= [mean(part_prob11) mean(part_prob00) mean(part_prob55) mean(part_prob44)];
bpart_cd=  [var(part_prob11) var(part_prob00) var(part_prob55) var(part_prob44)];
upart_cd=  [mean(upart_prob11) mean(upart_prob00) mean(upart_prob55) mean(upart_prob44)];
Tpart_cd=(1+1/m1)*bpart_cd+upart_cd;
df_cd=(m1-1)*((1+upart_cd./bpart_cd/(1+1/m1)).^2);
low_cd = qpart_cd -tinv(0.975, df_cd) .* sqrt(Tpart_cd);
up_cd = qpart_cd +tinv(0.975, df_cd) .* sqrt(Tpart_cd);


qpart_jt= [mean(part_prob77) mean(part_prob66)];
bpart_jt= [var(part_prob77) var(part_prob66)];
upart_jt= [mean(upart_prob77) mean(upart_prob66)];
Tpart_jt=(1+1/m1)*bpart_jt+upart_jt;
df_jt=(m1-1)*((1+upart_jt./bpart_jt/(1+1/m1)).^2);
low_jt = qpart_jt -tinv(0.975, df_jt) .* sqrt(Tpart_jt);
up_jt = qpart_jt +tinv(0.975, df_jt) .* sqrt(Tpart_jt);

%csvwrite('prob_mar_panelandref_est',[qpart_mar' low_mar' up_mar'])
%csvwrite('prob_cd_panelandref_est',[qpart_cd' low_cd' up_cd'])
csvwrite('prob__panelandref_est.csv',[qpart_jt' low_jt' up_jt'])


%%%%---------------MI inference for subgroups-----------------------%%%%
%party1 gender7 race6
np_g_r = d(7) * d(6) * d(1); 
lv_3_p_g_r = zeros(m1, np_g_r); ulv_3_p_g_r = zeros(m1, np_g_r); 

qlv_3=zeros(m1,np_g_r);
ulv_3=zeros(m1,np_g_r);

for l=1:m1
   micut=300 + 4*l;
   data1 = impdata_out(impdata_out(:,1)==micut,2:p+1);
    data = data1(1:Np,:); 
    
    %transfer to let female=1
    data(data(:,7)==1,7)=0;
    data(data(:,7)==2,7)=1;
    data(data(:,7)==0,7)=2;


    lv_3= data(:,q+p1+3); %1
    lv_3(lv_3==3)=0;
    lv_3(lv_3==4)=0;
    lv_3(lv_3==2)=1;


%sex race party
    for i3=1:d(1)
        for i1=1:d(6)
            for i2=1:d(7)
            lv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2)=mean(lv_3(data(:,6)==i1 & data(:,7)==i2 & data(:,1)==i3));
ulv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2)=lv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2)...
    *(1-lv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2))...
    /sum(data(:,6)==i1 & data(:,7)==i2 & data(:,1)==i3);
            end
        end
    end

    qlv_3(l,:)=lv_3_p_g_r(l,:);
    ulv_3(l,:)=ulv_3_p_g_r(l,:);
end


Qbar_lv_3_panel = mean(qlv_3); B_lv_3_panel=var(qlv_3); Ubar_lv_3_panel=mean(ulv_3); 

for l=1:m1
   micut=300 + 4*l;
   data = impdata_out(impdata_out(:,1)==micut,2:p+1);
 
    
    %transfer to let female=1
    data(data(:,7)==1,7)=0;
    data(data(:,7)==2,7)=1;
    data(data(:,7)==0,7)=2;


    lv_3= data(:,q+p1+3); %1
    lv_3(lv_3==3)=0;
    lv_3(lv_3==4)=0;
    lv_3(lv_3==2)=1;


%sex race party
    for i3=1:d(1)
        for i1=1:d(6)
            for i2=1:d(7)
            lv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2)=mean(lv_3(data(:,6)==i1 & data(:,7)==i2 & data(:,1)==i3));
ulv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2)=lv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2)...
    *(1-lv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2))...
    /sum(data(:,6)==i1 & data(:,7)==i2 & data(:,1)==i3);
            end
        end
    end

    qlv_3(l,:)=lv_3_p_g_r(l,:);
    ulv_3(l,:)=ulv_3_p_g_r(l,:);
end


Qbar_lv_3_all = mean(qlv_3); B_lv_3_all=var(qlv_3); Ubar_lv_3_all=mean(ulv_3); 

for l=1:m1
   micut=300 + 4*l;
   data1 = impdata_out(impdata_out(:,1)==micut,2:p+1);
    data = data1(1:Ncp,:); 
    
    %transfer to let female=1
    data(data(:,7)==1,7)=0;
    data(data(:,7)==2,7)=1;
    data(data(:,7)==0,7)=2;


    lv_3= data(:,q+p1+3); %1
    lv_3(lv_3==3)=0;
    lv_3(lv_3==4)=0;
    lv_3(lv_3==2)=1;


%sex race party
    for i3=1:d(1)
        for i1=1:d(6)
            for i2=1:d(7)
            lv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2)=mean(lv_3(data(:,6)==i1 & data(:,7)==i2 & data(:,1)==i3));
ulv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2)=lv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2)...
    *(1-lv_3_p_g_r(l,(i3-1)*d(6)*d(7)+(i1-1)*d(7)+i2))...
    /sum(data(:,6)==i1 & data(:,7)==i2 & data(:,1)==i3);
            end
        end
    end

    qlv_3(l,:)=lv_3_p_g_r(l,:);
    ulv_3(l,:)=ulv_3_p_g_r(l,:);
end

Qbar_lv_3_com = mean(qlv_3); B_lv_3_com=var(qlv_3); Ubar_lv_3_com=mean(ulv_3);


Tm_lv_3_panel=(1+1/m1)*B_lv_3_panel + Ubar_lv_3_panel; 

df_lv_3_panel=(m1-1)*((1+Ubar_lv_3_panel./B_lv_3_panel/(1+1/m1)).^2);

Tm_lv_3_all=(1+1/m1)*B_lv_3_all + Ubar_lv_3_all; 

df_lv_3_all=(m1-1)*((1+Ubar_lv_3_all./B_lv_3_all/(1+1/m1)).^2);

Tm_lv_3_com=(1+1/m1)*B_lv_3_com + Ubar_lv_3_com; 

df_lv_3_com=(m1-1)*((1+Ubar_lv_3_com./B_lv_3_com/(1+1/m1)).^2);

low_lv_3_panel = Qbar_lv_3_panel -tinv(0.975, df_lv_3_panel) .* sqrt(Tm_lv_3_panel);
up_lv_3_panel =  Qbar_lv_3_panel +tinv(0.975, df_lv_3_panel) .* sqrt(Tm_lv_3_panel);

low_lv_3_all = Qbar_lv_3_all -tinv(0.975, df_lv_3_all) .* sqrt(Tm_lv_3_all);
up_lv_3_all =  Qbar_lv_3_all +tinv(0.975, df_lv_3_all) .* sqrt(Tm_lv_3_all);


low_lv_3_com = Qbar_lv_3_com -tinv(0.975, df_lv_3_com) .* sqrt(Tm_lv_3_com);
up_lv_3_com =  Qbar_lv_3_com +tinv(0.975, df_lv_3_com) .* sqrt(Tm_lv_3_com);

csvwrite('lv3_subgroup_est.csv', [Qbar_lv_3_all' low_lv_3_all' up_lv_3_all'...
    Qbar_lv_3_panel' low_lv_3_panel' up_lv_3_panel' Qbar_lv_3_com' low_lv_3_com' up_lv_3_com'])



%%%%------------------posterior predictive check-----------------------%%%%

cj=1;
varis=q+p1+1:p; 
dis=length(varis);

com1=zeros(m,dis); com2=zeros(m,dis);
rep1=zeros(m,dis); rep2=zeros(m,dis);
for l=1:m
   data = impdata_out(impdata_out(:,1)==l,2:p+1);
    W = impdata_out(impdata_out(:,1)==l,p+2);

    for j=1:dis  
        
    tt_panel=crosstab(W(1:Np),data(1:Np,varis(j)));
    com2(l,j)=tt_panel(2,cj)/sum(W(1:Np));%cond w=1 in the panel
 
    tt_ref=crosstab(W(Np+1:N),data(Np+1:N,varis(j)));
    com1(l,j)=sum(tt_ref(:,cj))/Nr;%mar in ref
    end

    
end


for l=1:m
    data=repdata_out_1(repdata_out_1(:,1)==l,2:p+1);
    W = repdata_out_1(repdata_out_1(:,1)==l,p+2);
 
    for j=1:dis  
    tt_panel=crosstab(W(1:Np),data(1:Np,varis(j)));
   
    rep2(l,j)=tt_panel(2,cj)/sum(W(1:Np));%cond w=1 in the panel
    
    tt_ref=crosstab(W(Np+1:N),data(Np+1:N,varis(j)));

    rep1(l,j)=sum(tt_ref(:,cj))/Nr;%mar in ref
     
    end
    
end




hist([min(sum(com1>rep1),sum(rep1>com1))*2/m,min(sum(com2>rep2),sum(rep2>com2))*2/m])
xlabel('ppp for marginal probabilities')
ylabel('frequency')

 
